library(foreign)
library(stringr)
library(maptools)
library(chron)

# Remove whitespace at the start of each element of `x`.
# `x` is coerced to character by sub(); NA elements stay NA.
trim.leading <- function(x) {
  sub(pattern = "^\\s+", replacement = "", x = x)
}

# Remove whitespace at the end of each element of `x`.
# `x` is coerced to character by sub(); NA elements stay NA.
trim.trailing <- function(x) {
  sub(pattern = "\\s+$", replacement = "", x = x)
}

# Remove whitespace at both ends of each element of `x`.
# Interior whitespace is left alone; NA elements stay NA.
trim <- function(x) {
  edge_space <- "^\\s+|\\s+$"
  gsub(pattern = edge_space, replacement = "", x = x)
}


## ---- 2013 ------------------------------------------------------------
## Later blocks compare every other year's column names against this file.
d13 <- read.csv(file = "raw/2013.csv", header = TRUE)
dim(d13)
names(d13)

table(d13$dob, exclude = NULL)

## Date-of-birth components are created as all-NA columns for 2013;
## this script does not parse d13$dob.
d13[c("birth_day", "birth_month", "birth_year")] <- NA

## Lower-case the "CM" suffix on three column names.
colnames(d13)[colnames(d13) == "dettypCM"] <- "dettypcm"
colnames(d13)[colnames(d13) == "lineCM"] <- "linecm"
colnames(d13)[colnames(d13) == "detailCM"] <- "detailcm"


## ---- 2003 ------------------------------------------------------------
d03 <- read.csv(file = "raw/2003.csv", header = TRUE)
table(names(d03) %in% names(d13))  # 2003 columns also present in 2013
table(is.na(d03$timestop))

## dob is handled as MDDYYYY (7 characters) or MMDDYYYY (8 characters)
table(d03$dob, exclude = NULL)
d03$dob <- trim(as.character(d03$dob))
table(nchar(d03$dob))
table(d03$dob[!is.na(d03$dob) & nchar(d03$dob) == 7])
table(d03$dob[!is.na(d03$dob) & nchar(d03$dob) == 8])

## birth year: last four characters of dob
d03$birth_year <- substr(d03$dob, nchar(d03$dob) - 3, nchar(d03$dob))
table(d03$birth_year)

## birth month: first character (7-char dob) or first two (8-char dob);
## NA for a missing dob or any other length
d03$birth_month <- as.character(ifelse(
  nchar(d03$dob) == 7,
  substr(d03$dob, 1, 1),
  ifelse(nchar(d03$dob) == 8, substr(d03$dob, 1, 2), NA)
))
table(d03$birth_month)

## birth day: the two characters that follow the month
d03$birth_day <- as.character(ifelse(
  nchar(d03$dob) == 7,
  substr(d03$dob, 2, 3),
  ifelse(nchar(d03$dob) == 8, substr(d03$dob, 3, 4), NA)
))
table(d03$birth_day)

## ---- 2004 ------------------------------------------------------------
d04 <- read.csv(file = "raw/2004.csv", header = TRUE)
## stop dates recorded as "12311900" are rewritten to "12312004"
d04$datestop <- replace(
  d04$datestop,
  which(d04$datestop == "12311900"),
  "12312004"
)
table(is.na(d04$timestop))

## dob is handled as MDDYYYY (7 characters) or MMDDYYYY (8 characters)
d04$dob <- trim(as.character(d04$dob))
table(nchar(d04$dob))
table(d04$dob[!is.na(d04$dob) & nchar(d04$dob) == 7])
table(d04$dob[!is.na(d04$dob) & nchar(d04$dob) == 8])

## birth year: last four characters of dob
d04$birth_year <- substr(d04$dob, nchar(d04$dob) - 3, nchar(d04$dob))
table(d04$birth_year)

## birth month: first character (7-char dob) or first two (8-char dob);
## NA for a missing dob or any other length
d04$birth_month <- as.character(ifelse(
  nchar(d04$dob) == 7,
  substr(d04$dob, 1, 1),
  ifelse(nchar(d04$dob) == 8, substr(d04$dob, 1, 2), NA)
))
table(d04$birth_month)

## birth day: the two characters that follow the month
d04$birth_day <- as.character(ifelse(
  nchar(d04$dob) == 7,
  substr(d04$dob, 2, 3),
  ifelse(nchar(d04$dob) == 8, substr(d04$dob, 3, 4), NA)
))
table(d04$birth_day)


## ---- 2005 ------------------------------------------------------------
d05 <- read.csv(file = "raw/2005.csv", header = TRUE)
table(is.na(d05$timestop))

## dob is handled as MDDYYYY (7 characters) or MMDDYYYY (8 characters)
d05$dob <- trim(as.character(d05$dob))
table(nchar(d05$dob))
table(d05$dob[!is.na(d05$dob) & nchar(d05$dob) == 7])
table(d05$dob[!is.na(d05$dob) & nchar(d05$dob) == 8])

## birth year: last four characters of dob
d05$birth_year <- substr(d05$dob, nchar(d05$dob) - 3, nchar(d05$dob))
table(d05$birth_year)

## birth month: first character (7-char dob) or first two (8-char dob);
## NA for a missing dob or any other length
d05$birth_month <- as.character(ifelse(
  nchar(d05$dob) == 7,
  substr(d05$dob, 1, 1),
  ifelse(nchar(d05$dob) == 8, substr(d05$dob, 1, 2), NA)
))
table(d05$birth_month)

## birth day: the two characters that follow the month
d05$birth_day <- as.character(ifelse(
  nchar(d05$dob) == 7,
  substr(d05$dob, 2, 3),
  ifelse(nchar(d05$dob) == 8, substr(d05$dob, 3, 4), NA)
))
table(d05$birth_day)

table(names(d05) %in% names(d13))  # 2005 columns also present in 2013
## stop dates recorded as "12311900" are rewritten to "12312005"
d05$datestop <- replace(
  d05$datestop,
  which(d05$datestop == "12311900"),
  "12312005"
)
table(is.na(d05$timestop))


## ---- 2006 ------------------------------------------------------------
## This year's dates are handled as YYYY-MM-DD strings (10 characters).
d06 <- read.csv(file = "raw/2006.csv", header = TRUE)
## stop dates recorded as "1900-12-31" are rewritten to "2006-12-31"
d06$datestop <- replace(
  d06$datestop,
  which(d06$datestop == "1900-12-31"),
  "2006-12-31"
)
table(is.na(d06$timestop))

d06$dob <- trim(as.character(d06$dob))
table(nchar(d06$dob))
table(d06$dob[!is.na(d06$dob) & nchar(d06$dob) == 10])

## substr() returns NA for a missing dob, so no explicit NA mask is needed
## birth year: characters 1-4
d06$birth_year <- substr(d06$dob, 1, 4)
table(d06$birth_year)

## birth month: characters 6-7
d06$birth_month <- substr(d06$dob, 6, 7)
table(d06$birth_month)

## birth day: characters 9-10
d06$birth_day <- substr(d06$dob, 9, 10)
table(d06$birth_day)

table(names(d06) %in% names(d13))  # 2006 columns also present in 2013

## ---- 2007 ------------------------------------------------------------
d07 <- read.csv(file = "raw/2007.csv", header = TRUE)
## stop dates recorded as "12311900" are rewritten to "12312007"
d07$datestop <- replace(
  d07$datestop,
  which(d07$datestop == "12311900"),
  "12312007"
)
table(is.na(d07$timestop))

## dob is handled as MDDYYYY (7 characters) or MMDDYYYY (8 characters)
d07$dob <- trim(as.character(d07$dob))
table(nchar(d07$dob))
table(d07$dob[!is.na(d07$dob) & nchar(d07$dob) == 7])
table(d07$dob[!is.na(d07$dob) & nchar(d07$dob) == 8])

## birth year: last four characters of dob
d07$birth_year <- substr(d07$dob, nchar(d07$dob) - 3, nchar(d07$dob))
table(d07$birth_year)

## birth month: first character (7-char dob) or first two (8-char dob);
## NA for a missing dob or any other length
d07$birth_month <- as.character(ifelse(
  nchar(d07$dob) == 7,
  substr(d07$dob, 1, 1),
  ifelse(nchar(d07$dob) == 8, substr(d07$dob, 1, 2), NA)
))
table(d07$birth_month)

## birth day: the two characters that follow the month
d07$birth_day <- as.character(ifelse(
  nchar(d07$dob) == 7,
  substr(d07$dob, 2, 3),
  ifelse(nchar(d07$dob) == 8, substr(d07$dob, 3, 4), NA)
))
table(d07$birth_day)

table(names(d07) %in% names(d13))  # 2007 columns also present in 2013
## ---- 2008 ------------------------------------------------------------
d08 <- read.csv(file = "raw/2008.csv", header = TRUE)
## stop dates recorded as "12311900" are rewritten to "12312008"
d08$datestop <- replace(
  d08$datestop,
  which(d08$datestop == "12311900"),
  "12312008"
)
table(is.na(d08$timestop))

## dob is handled as MDDYYYY (7 characters) or MMDDYYYY (8 characters)
d08$dob <- trim(as.character(d08$dob))
table(nchar(d08$dob))
table(d08$dob[!is.na(d08$dob) & nchar(d08$dob) == 7])
table(d08$dob[!is.na(d08$dob) & nchar(d08$dob) == 8])

## birth year: last four characters of dob
d08$birth_year <- substr(d08$dob, nchar(d08$dob) - 3, nchar(d08$dob))
table(d08$birth_year)

## birth month: first character (7-char dob) or first two (8-char dob);
## NA for a missing dob or any other length
d08$birth_month <- as.character(ifelse(
  nchar(d08$dob) == 7,
  substr(d08$dob, 1, 1),
  ifelse(nchar(d08$dob) == 8, substr(d08$dob, 1, 2), NA)
))
table(d08$birth_month)

## birth day: the two characters that follow the month
d08$birth_day <- as.character(ifelse(
  nchar(d08$dob) == 7,
  substr(d08$dob, 2, 3),
  ifelse(nchar(d08$dob) == 8, substr(d08$dob, 3, 4), NA)
))
table(d08$birth_day)

table(names(d08) %in% names(d13))  # 2008 columns also present in 2013
## ---- 2009 ------------------------------------------------------------
d09 <- read.csv(file = "raw/2009.csv", header = TRUE)
table(is.na(d09$timestop))

## dob is handled as MDDYYYY (7 characters) or MMDDYYYY (8 characters)
d09$dob <- trim(as.character(d09$dob))
table(nchar(d09$dob))
table(d09$dob[!is.na(d09$dob) & nchar(d09$dob) == 7])
table(d09$dob[!is.na(d09$dob) & nchar(d09$dob) == 8])

## birth year: last four characters of dob
d09$birth_year <- substr(d09$dob, nchar(d09$dob) - 3, nchar(d09$dob))
table(d09$birth_year)

## birth month: first character (7-char dob) or first two (8-char dob);
## NA for a missing dob or any other length
d09$birth_month <- as.character(ifelse(
  nchar(d09$dob) == 7,
  substr(d09$dob, 1, 1),
  ifelse(nchar(d09$dob) == 8, substr(d09$dob, 1, 2), NA)
))
table(d09$birth_month)

## birth day: the two characters that follow the month
d09$birth_day <- as.character(ifelse(
  nchar(d09$dob) == 7,
  substr(d09$dob, 2, 3),
  ifelse(nchar(d09$dob) == 8, substr(d09$dob, 3, 4), NA)
))
table(d09$birth_day)


## ---- 2010 ------------------------------------------------------------
d10 <- read.csv(file = "raw/2010.csv", header = TRUE)
table(is.na(d10$timestop))

## dob is handled as MDDYYYY (7 characters) or MMDDYYYY (8 characters)
d10$dob <- trim(as.character(d10$dob))
table(nchar(d10$dob))
table(d10$dob[!is.na(d10$dob) & nchar(d10$dob) == 7])
table(d10$dob[!is.na(d10$dob) & nchar(d10$dob) == 8])

## birth year: last four characters of dob
d10$birth_year <- substr(d10$dob, nchar(d10$dob) - 3, nchar(d10$dob))
table(d10$birth_year)

## birth month: first character (7-char dob) or first two (8-char dob);
## NA for a missing dob or any other length
d10$birth_month <- as.character(ifelse(
  nchar(d10$dob) == 7,
  substr(d10$dob, 1, 1),
  ifelse(nchar(d10$dob) == 8, substr(d10$dob, 1, 2), NA)
))
table(d10$birth_month)

## birth day: the two characters that follow the month
d10$birth_day <- as.character(ifelse(
  nchar(d10$dob) == 7,
  substr(d10$dob, 2, 3),
  ifelse(nchar(d10$dob) == 8, substr(d10$dob, 3, 4), NA)
))
table(d10$birth_day)

## ---- 2011 ------------------------------------------------------------
d11 <- read.csv(file = "raw/2011.csv", header = TRUE)
table(is.na(d11$timestop))

## dob is handled as MDDYYYY (7 characters) or MMDDYYYY (8 characters)
d11$dob <- trim(as.character(d11$dob))
table(nchar(d11$dob))
table(d11$dob[!is.na(d11$dob) & nchar(d11$dob) == 7])
table(d11$dob[!is.na(d11$dob) & nchar(d11$dob) == 8])

## birth year: last four characters of dob
d11$birth_year <- substr(d11$dob, nchar(d11$dob) - 3, nchar(d11$dob))
table(d11$birth_year)

## birth month: first character (7-char dob) or first two (8-char dob);
## NA for a missing dob or any other length
d11$birth_month <- as.character(ifelse(
  nchar(d11$dob) == 7,
  substr(d11$dob, 1, 1),
  ifelse(nchar(d11$dob) == 8, substr(d11$dob, 1, 2), NA)
))
table(d11$birth_month)

## birth day: the two characters that follow the month
d11$birth_day <- as.character(ifelse(
  nchar(d11$dob) == 7,
  substr(d11$dob, 2, 3),
  ifelse(nchar(d11$dob) == 8, substr(d11$dob, 3, 4), NA)
))
table(d11$birth_day)


## ---- 2012 ------------------------------------------------------------
d12 <- read.csv(file = "raw/2012.csv", header = TRUE)
table(is.na(d12$timestop))

## dob is handled as MDDYYYY (7 characters) or MMDDYYYY (8 characters)
d12$dob <- trim(as.character(d12$dob))
table(nchar(d12$dob))
table(d12$dob[!is.na(d12$dob) & nchar(d12$dob) == 7])
table(d12$dob[!is.na(d12$dob) & nchar(d12$dob) == 8])

## birth year: last four characters of dob
d12$birth_year <- substr(d12$dob, nchar(d12$dob) - 3, nchar(d12$dob))
table(d12$birth_year)

## birth month: first character (7-char dob) or first two (8-char dob);
## NA for a missing dob or any other length
d12$birth_month <- as.character(ifelse(
  nchar(d12$dob) == 7,
  substr(d12$dob, 1, 1),
  ifelse(nchar(d12$dob) == 8, substr(d12$dob, 1, 2), NA)
))
table(d12$birth_month)

## birth day: the two characters that follow the month
d12$birth_day <- as.character(ifelse(
  nchar(d12$dob) == 7,
  substr(d12$dob, 2, 3),
  ifelse(nchar(d12$dob) == 8, substr(d12$dob, 3, 4), NA)
))
table(d12$birth_day)

table(names(d12) %in% names(d13))  # 2012 columns also present in 2013


## ---- stack all years --------------------------------------------------
## The common column set is defined as the 2006 columns that also exist in
## 2013; every year is cut down to those columns before row-binding.
drop <- setdiff(names(d06), names(d13))
d06 <- d06[, !(names(d06) %in% drop)]
keep <- names(d06)

table(is.na(d13$timestop))

## Row-bind 2003..2013 in chronological order.
d <- do.call(
  rbind.data.frame,
  lapply(
    list(d03, d04, d05, d06, d07, d08, d09, d10, d11, d12, d13),
    function(year_data) year_data[, keep]
  )
)
dim(d)

head(d)



## ---- use-of-force coding ----------------------------------------------
## Force indicator columns; a value of 'Y' marks that the force type was used.
force.vars <- c(
  "pf_hands", "pf_grnd", "pf_wall", "pf_drwep", "pf_ptwep",
  "pf_baton", "pf_hcuff", "pf_pepsp", "pf_other"
)

## frequency of each indicator, with NA shown as its own category
for (flag in force.vars) {
  print(table(d[, flag], exclude = NULL))
}
## no missing values

## crosstabs between 'other' force and remaining force levels
for (flag in force.vars) {
  cat(rep("-", 20), "\n", flag, "\n", sep = "")
  print(table(d[, flag], d$pf_other, exclude = NULL))
}

## how many cases of 'other' force aren't covered by remaining levels
prop.table(
  table(
    other = d$pf_other,
    covered = rowSums(d[, setdiff(force.vars, "pf_other")] == "Y") > 0
  ),
  margin = 1
)
## result: 91% of 'other' use of force doesn't even include use of hands
##         so we should probably treat this as 0 (it sounds like fryer does too)

## fryer, an empirical analysis of racial differences in police use of force
## figure 1 caption (use of force types)
##   0: when the police report using no force in a stop and frisk interaction
##   1: using at least hands
##   2: at least pushing a civilian to a wall
##   3: at least using handcuffs
##   4: at least drawing a weapon on a civilian
##   5: at least pushing a civilian to the ground
##   6: at least pointing a weapon at a civilian
##   7: at least using a pepper spray or a baton on a civilian
force.levels <- c(
  ## "pf_other" = NA,  #
  "pf_hands" = 1,
  "pf_wall"  = 2,
  "pf_hcuff" = 3,
  "pf_drwep" = 4,
  "pf_grnd"  = 5,
  "pf_ptwep" = 6,
  "pf_baton" = 7,
  "pf_pepsp" = 7
)

## Ordinal force scale. The indicators are visited in ascending level order,
## so each pass overwrites lower levels and a row ends at its highest level.
d$force <- 0
for (flag in names(force.levels)) {
  used <- d[, flag] == "Y"
  d$force <- ifelse(used, force.levels[flag], d$force)
}
table(d$force)

## Binary indicator: 1 if any column in force.vars (including pf_other) is
## 'Y', otherwise 0. Missing indicator values never set the flag.
d$force2 <- as.numeric(rowSums(d[, force.vars] == "Y", na.rm = TRUE) > 0)
table(d$force2)



## ---- race coding --------------------------------------------------------
table(d$race)

## One logical flag per group, built from the single-letter race codes.
d$asian <- I(d$race == "A")
d$black <- I(d$race == "B" | d$race == "P")
d$hisp  <- I(d$race == "Q")
d$white <- I(d$race == "W")
d$other <- I(d$race %in% c("Z", "I", "U"))
## NOTE(review): the original comment here read "drop obvs with unknown
## race", but no rows are removed in this block; code "U" falls in `other`.

dim(d)

## Collapse the flags into one factor with "white" as the reference level.
## Groups are assigned in the order asian, black, hisp, white, other.
d$race2 <- NA
for (grp in c("asian", "black", "hisp", "white", "other")) {
  d$race2[d[[grp]] == TRUE] <- grp
}
d$race2 <- factor(
  d$race2,
  levels = c("white", "black", "hisp", "asian", "other")
)
table(d$race2)


## Minimal extract: year, recoded race and ordinal force level.
d2 <- d[, c("year", "race2", "force")]


##try to replicate table 2A, no controls

## Logit of "any force used" (force > 0) on race, with white as reference.
m <- glm(I(force > 0) ~ race2, data = d, family = binomial(link = "logit"))

## Baseline rate: share of stops of white civilians with any force, computed
## on the rows actually used in the fit. The model frame stores the response
## under the name "I(force > 0)", so `m$model$force` is NULL and its mean is
## NA; the fitted object's response vector `m$y` is used instead.
mean(m$y[m$model$race2 == "white"])

## Odds ratios relative to the white reference group.
exp(m$coefficients)





## ---- gender: 1 = "F", 0 = "M", NA for any other value of sex -------------
table(d$sex)
d$gender <- as.numeric(
  ifelse(d$sex == "F", 1, ifelse(d$sex == "M", 0, NA))
)
table(d$gender)

library(date)

## ---- stop date and date of birth as `date` objects ----------------------
## datestop is handled in three layouts, told apart by string length:
##   7 chars  MDDYYYY,  8 chars  MMDDYYYY,  10 chars  YYYY-MM-DD
head(d$datestop)
table(nchar(d$datestop))

table(d$datestop[nchar(d$datestop) == 10])
table(d$datestop[nchar(d$datestop) == 7])
table(d$datestop[nchar(d$datestop) == 8])

## year: last four characters (7/8-char layouts) or first four (10-char)
d$stop_year <- NA
d$stop_year[nchar(d$datestop) == 7] <-
  substr(d$datestop[nchar(d$datestop) == 7], 4, 7)
d$stop_year[nchar(d$datestop) == 8] <-
  substr(d$datestop[nchar(d$datestop) == 8], 5, 8)
d$stop_year[nchar(d$datestop) == 10] <-
  substr(d$datestop[nchar(d$datestop) == 10], 1, 4)
d$stop_year <- as.numeric(d$stop_year)
table(d$stop_year, exclude = NULL)

## month: position depends on the layout; NA for any other length
d$stop_month <- as.numeric(ifelse(
  nchar(d$datestop) == 7,
  substr(d$datestop, 1, 1),
  ifelse(
    nchar(d$datestop) == 8,
    substr(d$datestop, 1, 2),
    ifelse(nchar(d$datestop) == 10, substr(d$datestop, 6, 7), NA)
  )
))
table(d$stop_month)

## day: position depends on the layout; NA for any other length
d$stop_day <- as.numeric(ifelse(
  nchar(d$datestop) == 7,
  substr(d$datestop, 2, 3),
  ifelse(
    nchar(d$datestop) == 8,
    substr(d$datestop, 3, 4),
    ifelse(nchar(d$datestop) == 10, substr(d$datestop, 9, 10), NA)
  )
))
table(d$stop_day)

## assemble the pieces with mdy.date() from the `date` package
d$stop_date <- mdy.date(
  month = d$stop_month,
  day = d$stop_day,
  year = d$stop_year
)
head(d$stop_date)
d$dob2 <- mdy.date(
  month = as.numeric(d$birth_month),
  day = as.numeric(d$birth_day),
  year = as.numeric(d$birth_year)
)
head(d$dob2)



## ---- age cleaning -------------------------------------------------------
summary(d$age)
table(d$age)

## Numeric copy of the recorded age. All range checks below use this copy:
## if `age` is character after stacking the years, `d$age < 10` would be a
## string comparison (e.g. "5" < "10" is FALSE), and on a factor it is NA.
## Values that cannot be parsed as numbers become NA here.
d$age2 <- as.numeric(as.character(d$age))
summary(d$age2)

## TRUE where the recorded age is implausible (< 10 or > 90);
## NA where the age is missing or unparseable.
age_out_of_range <- d$age2 < 10 | d$age2 > 90

## inspect the dob strings behind the implausible ages
table(d$dob[age_out_of_range], exclude = NULL)

table(nchar(d$dob[age_out_of_range]))
length(d$dob[age_out_of_range])
## parentheses make the length filter apply to both the "< 10" and the
## "> 90" cases (`&` binds tighter than `|`)
head(d$dob[(age_out_of_range) & nchar(d$dob) == 7])

##try to recover these using dob
## age3: days between stop date and date of birth, divided by 365, rounded
d$age3 <- round((d$stop_date - d$dob2) / 365, 0)
head(d$age3)

## Replace only non-missing implausible ages; which() drops the NA entries.
recoverable <- which(age_out_of_range)
d$age2[recoverable] <- d$age3[recoverable]
table(d$age2)

## anything still outside 10-90 after the recovery attempt is set to missing
d$age2[d$age2 < 10] <- NA
d$age2[d$age2 > 90] <- NA
table(d$age2)


## ---- stop time, part 1: working copy and colon placement -----------------
## how many times consist of one, two or three blanks
table(I(d$timestop == " "))
table(I(d$timestop == "  "))
table(I(d$timestop == "   "))


table(d$timestop, exclude = NULL)
## timestop2 is the character working copy that the cleaning steps modify
d$timestop2 <- as.character(d$timestop)
table(is.na(d$timestop2))


## two specific malformed values
d$timestop2[d$timestop == "00::1"] <- "00:01"
d$timestop2[d$timestop == "00:0"] <- "00:00"


## Insert the missing colon into times that contain none:
##   four characters "HHMM" -> "HH:MM",  three characters "HMM" -> "H:MM".
## The patterns match only strings with no ":" at all, so values that already
## have a colon (e.g. "10:" or "9:3") are left for the later steps. The
## previous test, str_detect(":", d$timestop2), had string and pattern
## swapped: it treated every 3/4-character value as colon-less (turning "10:"
## into "1:0:") and used the data values themselves as regular expressions.
## sub() leaves NA values as NA.
d$timestop2 <- sub("^([^:]{2})([^:]{2})$", "\\1:\\2", d$timestop2)
d$timestop2 <- sub("^([^:])([^:]{2})$", "\\1:\\2", d$timestop2)


## collapse doubled colons that are present in the raw values
d$timestop2 <- gsub("::", ":", d$timestop2)

## a single blank means the time is missing
d$timestop2[d$timestop2 == " "] <- NA

table(is.na(d$timestop2))
head(d$timestop[is.na(d$timestop2)], 50)



##fix back end
## A value that is only an hour followed by a colon ("9:", "09:", "17:") is
## set to that hour on the dot ("09:00", "17:00"). The patterns cover every
## hour 0-23; the previous hand-written list stopped at "20:", so "21:",
## "22:" and "23:" were never normalised. sub() leaves NA values as NA.
d$timestop2 <- sub("^([0-9]):$", "0\\1:00", d$timestop2)
d$timestop2 <- sub("^([01][0-9]|2[0-3]):$", "\\1:00", d$timestop2)



##fix front end
## Single-digit hours ("9:30") get a leading zero ("09:30"): any non-missing
## value whose second character is ":" is prefixed with "0".
d$timestop2[!is.na(d$timestop2) & substr(d$timestop2, 2, 2) == ":"] <-
  paste0(
    "0",
    d$timestop2[!is.na(d$timestop2) & substr(d$timestop2, 2, 2) == ":"]
  )

## sorted list of the distinct values that remain, for eyeballing
names(table(d$timestop2)[order(names(table(d$timestop2)))])




## Pad one-digit minutes: a non-missing, four-character value whose fourth
## character is a digit 0-9 (e.g. "09:4") gets a "0" inserted after its third
## character ("09:04"). The replacement "\\10\\2" is group 1, a literal "0",
## then group 2.
d$timestop2 <- ifelse(
  !is.na(d$timestop2) &
    nchar(d$timestop2) == 4 &
    substr(d$timestop2, 4, 4) %in% as.character(0:9),
  gsub('^(.{3})(.*)$', '\\10\\2', d$timestop2),
  d$timestop2
)


#old<-"09:4"
#gsub('^(.{3})(.*)$', '\\1:\\2', old)


## sorted list of the distinct values that remain, for eyeballing
names(table(d$timestop2)[order(names(table(d$timestop2)))])



## Character-level typo repairs, applied to every value in the order listed
## (order matters: "l" -> "0" runs before "o0" -> "00"). Each entry is
## c(regular expression, replacement).
d$timestop2 <- Reduce(
  function(times, fix) gsub(fix[1], fix[2], times),
  list(
    c("l", "0"),
    c("o0", "00"),
    c(" 3", "03"),
    c("L", "0"),
    c("!", "0"),
    c("12:\"0", "12:00"),
    c("=2", "02"),
    c("\\.3", "03")
  ),
  d$timestop2
)


## sorted list of the distinct values that remain, for eyeballing
names(table(d$timestop2)[order(names(table(d$timestop2)))])


##date
## ---- stop timestamp: zero-padded date strings plus the cleaned time -----
## datestop layouts by length: 8 = MMDDYYYY, 7 = MDDYYYY, 10 = YYYY-MM-DD
table(nchar(d$datestop))
head(d$datestop[nchar(d$datestop) == 10])

## four-digit year as a string
d$year <- NA
d$year[nchar(d$datestop) == 8] <-
  substr(d$datestop[nchar(d$datestop) == 8], 5, 8)
d$year[nchar(d$datestop) == 7] <-
  substr(d$datestop[nchar(d$datestop) == 7], 4, 7)
d$year[nchar(d$datestop) == 10] <-
  substr(d$datestop[nchar(d$datestop) == 10], 1, 4)
table(d$year, exclude = NULL)

## month as a string
d$month <- NA
d$month[nchar(d$datestop) == 8] <-
  substr(d$datestop[nchar(d$datestop) == 8], 1, 2)
d$month[nchar(d$datestop) == 7] <-
  substr(d$datestop[nchar(d$datestop) == 7], 1, 1)
d$month[nchar(d$datestop) == 10] <-
  substr(d$datestop[nchar(d$datestop) == 10], 6, 7)
## left-pad the single-digit months "1".."9" to "01".."09"
d$month <- ifelse(
  d$month %in% as.character(1:9),
  paste0("0", d$month),
  d$month
)

table(d$month, exclude = NULL)


## two-character day as a string
d$day <- NA
d$day[nchar(d$datestop) == 8] <-
  substr(d$datestop[nchar(d$datestop) == 8], 3, 4)
d$day[nchar(d$datestop) == 7] <-
  substr(d$datestop[nchar(d$datestop) == 7], 2, 3)
d$day[nchar(d$datestop) == 10] <-
  substr(d$datestop[nchar(d$datestop) == 10], 9, 10)
table(d$day, exclude = NULL)

## concatenate year, month, day and time, then strip the colons so the
## string can be parsed with the "%Y%m%d%H%M" format
d$date <- paste0(d$year, d$month, d$day, d$timestop2)
d$date <- gsub(":", "", d$date)
head(d$date)


## values that do not fit the format become NA
d$date2 <- as.POSIXct(d$date, format = "%Y%m%d%H%M", tz = "America/New_York")
head(d$date2)

## ---- daylight indicator ---------------------------------------------------
## 1 x 2 coordinate matrix passed to sunriset() as the location.
nyc <- matrix(c(-74.0059413, 40.7127837), ncol = 2)
#for_date <- as.POSIXct("2018-12-23", tz="America/New_York")

## date-only string "year-month-day"
d$date_alt <- paste(d$year, d$month, d$day, sep = "-")
head(d$date_alt)


## keep only rows whose date string is exactly 10 characters long
d <- d[nchar(d$date_alt) == 10, ]
d$date3 <- as.POSIXct(d$date_alt, tz = "America/New_York")
d$sunrise <- sunriset(
  nyc, d$date3,
  direction = "sunrise", POSIXct.out = TRUE
)$time
d$sunset <- sunriset(
  nyc, d$date3,
  direction = "sunset", POSIXct.out = TRUE
)$time



## daytime = 1 when sunrise <= stop time < sunset, 0 when the stop time is
## before sunrise or at/after sunset, NA when the comparison is undetermined.
d$daytime <- as.numeric(d$date2 >= d$sunrise & d$date2 < d$sunset)
table(d$daytime)



## ---- simple recodes ---------------------------------------------------------
## inout2: 1 = "I", 0 = "O", NA otherwise
table(d$inout)
d$inout2 <- as.numeric(
  ifelse(d$inout == "I", 1, ifelse(d$inout == "O", 0, NA))
)
table(d$inout2)

table(d$ac_incid)
table(d$ac_time)

## offunif2: 1 = "Y", 0 = "N", NA otherwise
table(d$offunif)
d$offunif2 <- as.numeric(
  ifelse(d$offunif == "Y", 1, ifelse(d$offunif == "N", 0, NA))
)
table(d$offunif2)

## typeofid2: keeps the codes "O", "P", "R", "V"; everything else becomes NA
table(d$typeofid)
d$typeofid2 <- as.character(ifelse(
  d$typeofid %in% c("O", "P", "R", "V"),
  as.character(d$typeofid),
  NA
))
table(d$typeofid2)


## othpers2: 1 = "Y", 0 = "N", NA otherwise
table(d$othpers)
d$othpers2 <- as.numeric(
  ifelse(d$othpers == "Y", 1, ifelse(d$othpers == "N", 0, NA))
)
table(d$othpers2)

## pct: the value "999" is set to missing
table(d$pct)
is.na(d$pct) <- which(d$pct == "999")
table(d$pct)


##suspect behavior
## Columns tabulated below; NA is shown as its own category in each table.
behav <- c(
  "sb_hdobj", "sb_outln", "sb_admis", "sb_other", "rf_attir", "cs_objcs",
  "cs_descr", "cs_casng", "cs_lkout", "rf_vcact", "cs_cloth", "cs_drgtr",
  "ac_evasv", "ac_assoc", "cs_furtv", "rf_rfcmp", "ac_cgdir", "rf_verbl",
  "cs_vcrim", "cs_bulge", "cs_other"
)

for (flag in behav) {
  print(table(d[, flag], exclude = NULL))
}

table(d$year)



## ---- analysis extract -------------------------------------------------------
## Columns kept for the models: outcomes, demographics, stop context,
## circumstance ("cs_") flags, precinct and year.
d2 <- d[, c(
  "force", "force2", "race2", "gender", "age2", "daytime", "inout2",
  "ac_incid", "ac_time", "offunif2", "typeofid2", "othpers2",
  "cs_objcs", "cs_descr", "cs_casng", "cs_lkout", "cs_cloth", "cs_drgtr",
  "cs_furtv", "cs_vcrim", "cs_bulge", "cs_other", "pct", "year"
)]

## columns currently left out of the extract:
#"sb_hdobj","sb_outln","sb_admis","sb_other","rf_attir","rf_vcact","ac_evasv","ac_assoc","rf_rfcmp","ac_cgdir","rf_verbl", "rf_attir",

dim(d2)

## complete cases: with every column, and without `daytime`
dim(na.omit(d2))
dim(na.omit(d2[, colnames(d2) != "daytime"]))


###
### fryer table 2A: The first row includes solely racial group dummies.
#  The second row adds controls for gender and a quadratic in age. The third
#  row adds controls for whether the stop was indoors or outdoors, whether the
#  stop took place during the daytime, whether the stop took place in a high
#  crime area or during a high crime time, whether the officer was in uniform,
#  civilian ID type, and whether others were stopped during the interaction.
#  The fourth row adds controls for civilian behavior. The fifth row adds
#  precinct and year fixed effects. Each row includes missings in all
#  variables. Standard errors clustered at the precinct level are reported in
#  parentheses.



save(d2, file = "data/sqf_fryer_2003_2013.Rdata")
